In [1]:
import glob
import os
import subprocess
import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import ciepy
import cardipspy as cpy
%matplotlib inline
In [2]:
outdir = os.path.join(ciepy.root, 'output',
'input_data')
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output',
'input_data')
cpy.makedir(private_outdir)
In [3]:
dy = '/projects/CARDIPS/data/database/20160129'
fn = os.path.join(dy, 'baseline_analyte.tsv')
baseline_analyte = pd.read_table(fn, index_col=1)
fn = os.path.join(dy, 'baseline_wgsisaac.tsv')
baseline_wgsisaac = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_ipsc.tsv')
baseline_ipsc = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_wgs.tsv')
baseline_wgs = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_cnv.tsv')
baseline_cnv = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_rnas.tsv')
baseline_rnas = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_ibd.tsv')
baseline_ibd = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_snpa.tsv')
baseline_snpa = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'baseline_tissue.tsv')
baseline_tissue = pd.read_table(fn, index_col=1)
fn = os.path.join(dy, 'family1070_rnas.tsv')
family1070_rnas = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'family1070_tissue.tsv')
family1070_tissue = pd.read_table(fn, index_col=1)
fn = os.path.join(dy, 'subject_pedigree.tsv')
subject_pedigree = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'subject_family.tsv')
subject_family = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'subject_subject.tsv')
subject_subject = pd.read_table(fn, index_col=3)
# The only columns that I should use from data_* tables
# are the seq_id and status columns. Others may be wrong.
fn = os.path.join(dy, 'data_wgs.tsv')
data_wgs = pd.read_table(fn, index_col=2)
fn = os.path.join(dy, 'data_snpa.tsv')
data_snpa = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'data_array.tsv')
data_array = pd.read_table(fn, index_col=0)
# fn = os.path.join(dy, 'data_chips.tsv')
# data_chips = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'data_atacs.tsv')
data_atacs = pd.read_table(fn, index_col=0)
# fn = os.path.join(dy, 'data_metha.tsv')
# data_metha = pd.read_table(fn, index_col=0)
# fn = os.path.join(dy, 'data_hic.tsv')
# data_hic = pd.read_table(fn, index_col=0)
fn = os.path.join(dy, 'data_rnas.tsv')
data_rnas = pd.read_table(fn, index_col=2)
fn = os.path.join(dy, 'data_sequence.tsv')
data_sequence = pd.read_table(fn, index_col=0)
In [4]:
cnv = baseline_cnv.merge(baseline_snpa, left_on='snpa_id', right_index=True,
suffixes=['_cnv', '_snpa'])
cnv = cnv.merge(baseline_analyte, left_on='analyte_id', right_index=True,
suffixes=['_cnv', '_analyte'])
cnv = cnv.merge(baseline_tissue, left_on='tissue_id', right_index=True,
suffixes=['_cnv', '_tissue'])
cnv = cnv[['type', 'chr', 'start', 'end', 'len', 'primary_detect_method',
'clone', 'passage', 'subject_id']]
In [5]:
# Get family1070 samples.
tdf = family1070_rnas[family1070_rnas.comment.isnull()]
tdf = tdf.merge(family1070_tissue, left_on='tissue_id', right_index=True,
suffixes=['_rna', '_tissue'])
tdf = tdf[tdf.cell_type == 'iPSC']
tdf.index = tdf.rnas_id
tdf['status'] = data_rnas.ix[tdf.index, 'status']
tdf = tdf[tdf.status == 0]
tdf = tdf[['ipsc_clone_number', 'ipsc_passage', 'subject_id']]
tdf.columns = ['clone', 'passage', 'subject_id']
tdf['isolated_by'] = 'p'
tdf.index.name = 'rna_id'
# Get the iPSC eQTL samples.
rna = baseline_rnas[baseline_rnas.rnas_id.isnull() == False]
rna.index = rna.rnas_id
rna.index.name = 'rna_id'
rna['status'] = data_rnas.ix[rna.index, 'status']
rna = rna[rna.status == 0]
#rna = rna.ix[censor[censor == False].index]
rna = rna.merge(baseline_analyte, left_on='analyte_id', right_index=True,
suffixes=['_rnas', '_analyte'])
rna = rna.merge(baseline_tissue, left_on='tissue_id', right_index=True,
suffixes=['_rnas', '_tissue'])
rna = rna[['clone', 'passage', 'subject_id']]
rna['isolated_by'] = 'a'
rna = pd.concat([rna, tdf])
In [6]:
# Get 222 subjects.
cohort222 = baseline_ipsc.merge(baseline_tissue, left_on='tissue_id',
right_index=True, suffixes=['_ipsc', '_tissue'])
n = len(set(rna.subject_id) & set(cohort222.subject_id))
print('We have {} of the 222 subjects in the "222 cohort."'.format(n))
In [7]:
rna['sequence_id'] = data_rnas.ix[rna.index, 'sequence_id']
In [8]:
rna['in_eqtl'] = False
In [9]:
samples = (cohort222.subject_id + ':' + cohort222.clone.astype(int).astype(str) +
':' + cohort222.passage.astype(int).astype(str))
t = rna.dropna(subset=['passage'])
t.loc[:, ('sample')] = (t.subject_id + ':' + t.clone.astype(int).astype(str) +
':' + t.passage.astype(int).astype(str))
t = t[t['sample'].apply(lambda x: x in samples.values)]
In [10]:
# These samples are in the 222 cohort and the eQTL analysis.
rna['in_222'] = False
rna.ix[t.index, 'in_222'] = True
rna.ix[t.index, 'in_eqtl'] = True
Now I'll add in any samples for which we have CNVs but weren't in the 222.
In [11]:
samples = (cnv.subject_id + ':' + cnv.clone.astype(int).astype(str) +
':' + cnv.passage.astype(int).astype(str))
t = rna.dropna(subset=['passage'])
t.loc[:, ('sample')] = (t.subject_id + ':' + t.clone.astype(int).astype(str) +
':' + t.passage.astype(int).astype(str))
t = t[t['sample'].apply(lambda x: x in samples.values)]
t = t[t.subject_id.apply(lambda x: x not in rna.ix[rna.in_eqtl, 'subject_id'].values)]
# These samples aren't in the 222 but we have a measured CNV for them.
rna.ix[t.index, 'in_eqtl'] = True
Now I'll add in samples where the clone was in the 222 but we don't have the same passage number.
In [12]:
samples = (cohort222.subject_id + ':' + cohort222.clone.astype(int).astype(str))
t = rna[rna.in_eqtl == False]
t = t[t.subject_id.apply(lambda x: x not in rna.ix[rna.in_eqtl, 'subject_id'].values)]
t['samples'] = t.subject_id + ':' + t.clone.astype(int).astype(str)
t = t[t.samples.apply(lambda x: x in samples.values)]
# These clones are in the 222, we just have a different passage number.
rna['clone_in_222'] = False
rna.ix[rna.in_222, 'clone_in_222'] = True
rna.ix[t.index, 'clone_in_222'] = True
rna.ix[t.index, 'in_eqtl'] = True
Now I'll add in any samples from subjects we don't yet have in the eQTL analysis.
In [13]:
t = rna[rna.in_eqtl == False]
t = t[t.subject_id.apply(lambda x: x not in rna.ix[rna.in_eqtl, 'subject_id'].values)]
rna.ix[t.index, 'in_eqtl'] = True
In [14]:
n = rna.in_eqtl.value_counts()[True]
print('We potentially have {} distinct subjects in the eQTL analysis.'.format(n))
In [15]:
wgs = baseline_wgs.merge(baseline_analyte, left_on='analyte_id', right_index=True,
suffixes=['_wgs', '_analyte'])
wgs = wgs.merge(baseline_tissue, left_on='tissue_id', right_index=True,
suffixes=['_wgs', '_tissue'])
wgs = wgs.merge(baseline_analyte, left_on='analyte_id', right_index=True,
suffixes=['_wgs', '_analyte'])
wgs = wgs.dropna(subset=['wgs_id'])
wgs.index = wgs.wgs_id
wgs['status'] = data_wgs.ix[wgs.index, 'status']
wgs = wgs[wgs.status == 0]
In [16]:
rna['wgs_id'] = ''
In [17]:
for i in rna.index:
s = rna.ix[i, 'subject_id']
t = wgs[wgs.subject_id == s]
if t.shape[0] == 1:
rna.ix[i, 'wgs_id'] = t.index[0]
elif t.shape[0] > 1:
if 'Blood' in t.source.values:
t = t[t.source == 'Blood']
elif 'iPSC' in t.source.values:
t = t[t.source == 'iPSC']
if t.shape[0] == 1:
rna.ix[i, 'wgs_id'] = t.index[0]
else:
print('?: {}'.format(i))
else:
#print('No WGS: {}'.format(i))
print('No WGS: {}'.format(rna.ix[i, 'subject_id']))
rna.ix[i, 'in_eqtl'] = False
rna.ix[rna['wgs_id'] == '', 'wgs_id'] = np.nan
In [18]:
n = rna.in_eqtl.value_counts()[True]
print('We are left with {} subjects for the eQTL analysis.'.format(n))
I'm going to keep one WGS sample per person in the cohort (preferentially blood, fibroblast, and finally iPSC) even if we don't have RNA-seq in case we want to look at phasing etc.
In [19]:
vc = wgs.subject_id.value_counts()
vc = vc[vc > 1]
keep = []
for s in vc.index:
t = wgs[wgs.subject_id == s]
if t.shape[0] == 1:
keep.append(t.index[0])
elif t.shape[0] > 1:
if 'Blood' in t.source.values:
t = t[t.source == 'Blood']
elif 'iPSC' in t.source.values:
t = t[t.source == 'iPSC']
if t.shape[0] == 1:
keep.append(t.index[0])
else:
print('?: {}'.format(i))
wgs = wgs.drop(set(wgs[wgs.subject_id.apply(lambda x: x in vc.index)].index) - set(keep))
In [20]:
wgs = wgs[['source', 'subject_id']]
wgs.columns = ['cell', 'subject_id']
In [21]:
subject = subject_subject.copy(deep=True)
subject = subject.ix[set(rna.subject_id) | set(wgs.subject_id)]
subject = subject[['sex', 'age', 'family_id', 'father_id', 'mother_id',
'twin_id', 'ethnicity_group']]
In [22]:
fn = os.path.join(outdir, 'cnvs.tsv')
if not os.path.exists(fn):
cnv.to_csv(fn, sep='\t')
rna.index.name = 'sample_id'
fn = os.path.join(outdir, 'rnaseq_metadata.tsv')
if not os.path.exists(fn):
rna.to_csv(fn, sep='\t')
fn = os.path.join(outdir, 'subject_metadata.tsv')
if not os.path.exists(fn):
subject.to_csv(fn, sep='\t')
fn = os.path.join(outdir, 'wgs_metadata.tsv')
if not os.path.exists(fn):
wgs.to_csv(fn, sep='\t')
In [24]:
dy = '/projects/CARDIPS/pipeline/RNAseq/combined_files'
# STAR logs.
fn = os.path.join(dy, 'star_logs.tsv')
logs = pd.read_table(fn, index_col=0, low_memory=False)
logs = logs.ix[rna.index]
logs.index.name = 'sample_id'
fn = os.path.join(outdir, 'star_logs.tsv')
if not os.path.exists(fn):
logs.to_csv(fn, sep='\t')
# Picard stats.
fn = os.path.join(dy, 'picard_metrics.tsv')
picard = pd.read_table(fn, index_col=0, low_memory=False)
picard = picard.ix[rna.index]
picard.index.name = 'sample_id'
fn = os.path.join(outdir, 'picard_metrics.tsv')
if not os.path.exists(fn):
picard.to_csv(fn, sep='\t')
# Expression values.
fn = os.path.join(dy, 'rsem_tpm_isoforms.tsv')
tpm = pd.read_table(fn, index_col=0, low_memory=False)
tpm = tpm[rna.index]
fn = os.path.join(outdir, 'rsem_tpm_isoforms.tsv')
if not os.path.exists(fn):
tpm.to_csv(fn, sep='\t')
fn = os.path.join(dy, 'rsem_tpm.tsv')
tpm = pd.read_table(fn, index_col=0, low_memory=False)
tpm = tpm[rna.index]
fn = os.path.join(outdir, 'rsem_tpm.tsv')
if not os.path.exists(fn):
tpm.to_csv(fn, sep='\t')
fn = os.path.join(dy, 'rsem_expected_counts.tsv')
ec = pd.read_table(fn, index_col=0, low_memory=False)
ec = ec[rna.index]
fn = os.path.join(outdir, 'rsem_expected_counts.tsv')
if not os.path.exists(fn):
ec.to_csv(fn, sep='\t')
ec_sf = cpb.analysis.deseq2_size_factors(ec.astype(int), meta=rna, design='~subject_id')
fn = os.path.join(outdir, 'rsem_expected_counts_size_factors.tsv')
if not os.path.exists(fn):
ec_sf.to_csv(fn, sep='\t')
ec_n = (ec / ec_sf)
fn = os.path.join(outdir, 'rsem_expected_counts_norm.tsv')
if not os.path.exists(fn):
ec_n.to_csv(fn, sep='\t')
fn = os.path.join(dy, 'gene_counts.tsv')
gc = pd.read_table(fn, index_col=0, low_memory=False)
gc = gc[rna.index]
fn = os.path.join(outdir, 'gene_counts.tsv')
if not os.path.exists(fn):
gc.to_csv(fn, sep='\t')
gc_sf = cpb.analysis.deseq2_size_factors(gc, meta=rna, design='~subject_id')
fn = os.path.join(outdir, 'gene_counts_size_factors.tsv')
if not os.path.exists(fn):
gc_sf.to_csv(fn, sep='\t')
gc_n = (gc / gc_sf)
fn = os.path.join(outdir, 'gene_counts_norm.tsv')
if not os.path.exists(fn):
gc_n.to_csv(fn, sep='\t')
In [25]:
# Allele counts.
cpy.makedir(os.path.join(private_outdir, 'allele_counts'))
fns = glob.glob('/projects/CARDIPS/pipeline/RNAseq/sample/'
'*/*mbased/*mbased_input.tsv')
fns = [x for x in fns if x.split('/')[-3] in rna.index]
for fn in fns:
new_fn = os.path.join(private_outdir, 'allele_counts', os.path.split(fn)[1])
if not os.path.exists(new_fn):
os.symlink(fn, new_fn)
# MBASED ASE results.
dy = '/projects/CARDIPS/pipeline/RNAseq/combined_files'
df = pd.read_table(os.path.join(dy, 'mbased_major_allele_freq.tsv'), index_col=0)
df = df[rna.index].dropna(how='all')
df.to_csv(os.path.join(outdir, 'mbased_major_allele_freq.tsv'), sep='\t')
df = pd.read_table(os.path.join(dy, 'mbased_p_val_ase.tsv'), index_col=0)
df = df[rna.index].dropna(how='all')
df.to_csv(os.path.join(outdir, 'mbased_p_val_ase.tsv'), sep='\t')
df = pd.read_table(os.path.join(dy, 'mbased_p_val_het.tsv'), index_col=0)
df = df[rna.index].dropna(how='all')
df.to_csv(os.path.join(outdir, 'mbased_p_val_het.tsv'), sep='\t')
cpy.makedir(os.path.join(private_outdir, 'mbased_snv'))
fns = glob.glob('/projects/CARDIPS/pipeline/RNAseq/sample/*/*mbased/*_snv.tsv')
fns = [x for x in fns if x.split('/')[-3] in rna.index]
for fn in fns:
new_fn = os.path.join(private_outdir, 'mbased_snv', os.path.split(fn)[1])
if not os.path.exists(new_fn):
os.symlink(fn, new_fn)
In [26]:
fn = os.path.join(private_outdir, 'autosomal_variants.vcf.gz')
if not os.path.exists(fn):
os.symlink('/projects/CARDIPS/pipeline/WGS/mergedVCF/CARDIPS_201512.PASS.vcf.gz',
fn)
os.symlink('/projects/CARDIPS/pipeline/WGS/mergedVCF/CARDIPS_201512.PASS.vcf.gz.tbi',
fn + '.tbi')
In [5]:
fn = os.path.join(outdir, 'GSE73211.tsv')
if not os.path.exists(fn):
os.symlink('/projects/CARDIPS/pipeline/RNAseq/combined_files/GSE73211.tsv', fn)
GSE73211 = pd.read_table(fn, index_col=0)
dy = '/projects/CARDIPS/pipeline/RNAseq/combined_files'
fn = os.path.join(dy, 'rsem_tpm.tsv')
tpm = pd.read_table(fn, index_col=0, low_memory=False)
tpm = tpm[GSE73211.index]
fn = os.path.join(outdir, 'GSE73211_tpm.tsv')
if not os.path.exists(fn):
tpm.to_csv(fn, sep='\t')